Part 1: Logistic Regression

Loading and Prepping Data

# Load the following packages needed for modeling in this assignment
  
  require(caret)
## Loading required package: caret
## Loading required package: ggplot2
## Loading required package: lattice
  require(recipes)
## Loading required package: recipes
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## 
## Attaching package: 'recipes'
## The following object is masked from 'package:stats':
## 
##     step
  require(finalfit)
## Loading required package: finalfit
  require(glmnet)
## Loading required package: glmnet
## Loading required package: Matrix
## Loaded glmnet 4.1-2
  require(ModelMetrics)
## Loading required package: ModelMetrics
## 
## Attaching package: 'ModelMetrics'
## The following objects are masked from 'package:caret':
## 
##     confusionMatrix, precision, recall, sensitivity, specificity
## The following object is masked from 'package:base':
## 
##     kappa
# Import the tweet dataset with embeddings

tweet <- read.csv('https://raw.githubusercontent.com/uo-datasci-specialization/c4-ml-fall-2021/main/content/post/hw2/data/hw1_tweet_final.csv',header=TRUE)



# Recipe for the tweet dataset

blueprint_tweet <- recipe(x  = tweet,
                          vars  = colnames(tweet),
                          roles = c('outcome',rep('predictor',772))) %>%
  step_dummy('month',one_hot=TRUE) %>% 
  step_harmonic('day',frequency=1,cycle_size=7, role='predictor') %>%
  step_harmonic('date',frequency=1,cycle_size=31,role='predictor') %>%
  step_harmonic('hour',frequency=1,cycle_size=24,role='predictor') %>%
  step_normalize(paste0('Dim',1:768)) %>%
  step_normalize(c('day_sin_1','day_cos_1',
                   'date_sin_1','date_cos_1',
                   'hour_sin_1','hour_cos_1')) %>%
  step_num2factor(sentiment,
                  transform = function(x) x + 1,
                  levels=c('Negative','Positive'))

  
    # Notice that I explicitly specified role=predictor when using
    # step_harmonic function. This assures that the newly derived sin and cos
    # variables has a defined role.
    # You need to do this otherwise caret::train function breaks.
    # caret_train requires every variable in the recipe to have a role
    
    # You can run the following code and make sure every variable has a defined 
    # role. If you want to experiment, remove the role=predictor argument
    # in the step_harmonic function, create the recipe again, and run the following
    # you will see that the new sin and cos variables have NA in the column role
    # and this breaks the caret::train function later.
  
    # Also, in the last line, we transform the outcome variable 'sentiment' to 
    # a factor with labels. 
    # This seems necessary for fitting logistic regression via caret::train

    print(blueprint_tweet %>% prep() %>% summary)
## # A tibble: 781 x 4
##    variable  type    role      source  
##    <chr>     <chr>   <chr>     <chr>   
##  1 sentiment nominal outcome   original
##  2 day       numeric predictor original
##  3 date      numeric predictor original
##  4 hour      numeric predictor original
##  5 Dim1      numeric predictor original
##  6 Dim2      numeric predictor original
##  7 Dim3      numeric predictor original
##  8 Dim4      numeric predictor original
##  9 Dim5      numeric predictor original
## 10 Dim6      numeric predictor original
## # ... with 771 more rows

Task 1.1

Split the original data into two subsets: training and test. Let the training data have the 80% of cases and the test data have the 20% of the cases.

set.seed(11142021)  # for reproducibility
  
loc      <- sample(1:nrow(tweet), round(nrow(tweet) * 0.8))
tweet_train  <- tweet[loc, ]
tweet_test  <- tweet[-loc, ]

tweet_train
tweet_test
prepare <- prep(blueprint_tweet, 
                training = tweet_train)

baked_train <- bake(prepare, new_data = tweet_train)
  
baked_test <- bake(prepare, new_data = tweet_test)

Task 1.2

Use the caret::train() function to train a model with 10-fold cross-validation for predicting the probability of sentiment being positive using logistic regression without any regularization. Evaluate and report the performance of the model on the test dataset.

##Making a crossfold function

crossfold_log <- function(training_data, folds){
  
  #shuffle data
  traning_data <- training_data[sample(nrow(training_data)),]
  

    # Create 10 folds with equal size

      N_folds = cut(seq(1,nrow(training_data)),breaks= folds,labels=FALSE)
  
    # Create the list for each fold 
      
      my.indices <- vector('list',folds)
      for(i in 1:folds){
        my.indices[[i]] <- which(N_folds!=i)
      }
      
      #cross validation settings
      
      cv <- trainControl(method    = "cv",
                   index           = my.indices,
                   classProbs      = TRUE,
                   summaryFunction = mnLogLoss)
      
      
return(cv)
}

#Making an accuracy function 

accuracy <- function(observed_vector, predicted_vector){
tab <- table(predicted_vector,
             observed_vector,
             dnn = c('Predicted','Observed'))



tn <- tab[1,1]
tp <- tab[2,2]
fp <- tab[2,1]
fn <- tab[1,2]

acc <- (tp + tn)/(tp+tn+fp+fn)

return(acc)
}
crossfold_tweet <- crossfold_log(tweet_train, 10)

mod_1 <- caret::train(blueprint_tweet, 
                          data      = tweet_train, 
                          method    = "glm", 
                          trControl = crossfold_tweet)
## Warning in train.recipe(blueprint_tweet, data = tweet_train, method = "glm", :
## The metric "Accuracy" was not in the result set. logLoss will be used instead.
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
predicted_test <- predict(mod_1, tweet_test) %>% 
  as.numeric() %>% -1 #subtracting 1 b/c factor levels came out as 1/2 rather than 0/1
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
observed_test <- tweet_test$sentiment %>% as.numeric()


##Eval Metric/s
print("Error Variance")
## [1] "Error Variance"
RMSE_test <- RMSE(predicted_test, observed_test)
cat("Test RMSEA is", RMSE_test, "\n")
## Test RMSEA is 0.5477226
mae_test <- MAE(predicted_test, observed_test)
cat("Test MAE is", mae_test, "\n")
## Test MAE is 0.3
print("Accuracy and Prediction Stats")
## [1] "Accuracy and Prediction Stats"
#LogLoss
LL <- logLoss(observed_test, predicted_test)
cat("LogLoss is", LL, "\n")
## LogLoss is 10.36175
#AUC
AUC <- auc(observed_test, predicted_test)
cat("Area Under Curve is", AUC, "\n")
## Area Under Curve is 0.6993452
#Accuracy
ACC <- accuracy(observed_test, predicted_test)
cat("Accuracy is", ACC, "\n")
## Accuracy is 0.7
#True Positive Rate
TPR <- tpr(observed_test, predicted_test, cutoff = .5)
cat("True Positive Rate is", TPR, "\n")
## True Positive Rate is 0.6853147
#True Negative Rate
TNR <- tnr(observed_test, predicted_test, cutoff = .5)
cat("True Negative Rate is", TNR, "\n")
## True Negative Rate is 0.7133758
#Precision
PRE <- precision(observed_test, predicted_test, cutoff = .5)
cat("Precision is", PRE, "\n")
## Precision is 0.6853147
mod_1_stats <- c("Unregularized Regression", LL, AUC, TPR, TNR, PRE)

Got errors for this, glm algorithm didn’t emerge. Model performance stats - LogLoss is 10.36175 Area Under Curve is 0.6993452 Accuracy is 0.7 True Positive Rate is 0.6853147 True Negative Rate is 0.7133758 Precision is 0.6853147

This first prediction model is a little better than chance (which would be ~.5 for prediction metrics). Accuracy stats for both TPR and TNR are similar, meaning similar chance of being identified correctly as positive, when positive tweet, and negative when negative tweet. Tweets will be classified correctly ~70% of time.

Task 1.3.

Use the caret::train() function to train a model with 10-fold cross-validation for predicting the probability of sentiment being positive using logistic regression with ridge penalty. Try different values of ridge penalty to decide the optimal value. Use logLoss as a metric for optimization. Plot the results, and report the optimal value of ridge penalty.

crossfold_tweet <- crossfold_log(tweet_train, 10)

ridge_grid <- data.frame(alpha = 0, lambda = c(seq(0.21320, 0.21332, .000001)))

mod_2 <- caret::train(blueprint_tweet, 
                      data = tweet_train,
                         method    = "glmnet",
                         family    = 'binomial',
                         metric    = 'logLoss',
                        trControl = crossfold_tweet,
                                     tuneGrid  = ridge_grid)
mod_2$bestTune
predicted_test <- predict(mod_2, tweet_test) %>% 
  as.numeric() %>% -1 #subtracting 1 b/c factor levels came out as 1/2 rather than 0/1

observed_test <- tweet_test$sentiment %>% as.numeric()

plot(mod_2)

#LogLoss
LL <- logLoss(observed_test, predicted_test)
  
#AUC
AUC <- auc(observed_test, predicted_test)


#Accuracy
ACC <- accuracy(observed_test, predicted_test)

#True Positive Rate
TPR <- tpr(observed_test, predicted_test, cutoff = .5)

#True Negative Rate
TNR <- tnr(observed_test, predicted_test, cutoff = .5)

#Precision
PRE <- precision(observed_test, predicted_test, cutoff = .5)

mod_2_stats <- c("Ridge Regression", LL, AUC, TPR, TNR, PRE)
  1. .001 to 3 by .01. (Used .211)
  2. .2 to .3 by .0005 (Used 0.2135)
  3. .212 to .216 by .00003 (Used 0.21326)
  4. 0.21320 to 0.21332 by .000001 (Used 0.21325)

Stopped since this seems to be narrowing in pretty close to optimized value. Optimal lambda value = 0.21325

Task 1.4.

Use the caret::train() function to train a model with 10-fold cross-validation for predicting the probability of sentiment being positive using logistic regression with lasso penalty. Try different values of lasso penalty to decide optimal value. Use logLoss as a metric for optimization. Plot the results, and report the optimal value of lasso penalty.

crossfold_tweet <- crossfold_log(tweet_train, 10)

lasso_grid <- data.frame(alpha = 1, lambda = c(seq(.01075, .0108, .0000001)))

mod_3 <- caret::train(blueprint_tweet, 
                      data = tweet_train, 
                         method    = "glmnet",
                         family    = 'binomial',
                         metric    = 'logLoss',
                        trControl = crossfold_tweet,
                                     tuneGrid  = lasso_grid)

mod_3$bestTune
predicted_test <- predict(mod_3, tweet_test) %>% 
  as.numeric() %>% -1 #subtracting 1 b/c factor levels came out as 1/2 rather than 0/1

observed_test <- tweet_test$sentiment %>% as.numeric()

plot(mod_3)

#LogLoss
LL <- logLoss(observed_test, predicted_test)
  
#AUC
AUC <- auc(observed_test, predicted_test)

#Accuracy
ACC <- accuracy(observed_test, predicted_test)

#True Positive Rate
TPR <- tpr(observed_test, predicted_test, cutoff = .5)

#True Negative Rate
TNR <- tnr(observed_test, predicted_test, cutoff = .5)

#Precision
PRE <- precision(observed_test, predicted_test, cutoff = .5)

mod_3_stats <- c("Lasso Regression", LL, AUC, TPR, TNR, PRE)

Test parameter sets: 1) 0 to 3 by .01 (Used 0.01) 2) .0001 to .5 by .004 (Used 0.0121) 3) 0.01 to .02, by .00005 (Used 0.0108) 4) 0.01 to .012 by .000005 (Used 0.010785) 5) .0107 to .0109 by .0000005 (Used 0.010789) 6) .01075 to .0108 by .0000001 (Used 0.010789)

Stopped since it seems like a local minimum has been reached. Optimal lambda value is 0.010789.

Task 1.5

Evaluate the performance of the models in 1.2, 1.3, and 1.4 on the test dataset. Calculate and report logLoss (LL), area under the reciever operating characteristic curver (AUC), overall accuracy (ACC), true positive rate (TPR), true negative rate (TNR), and precision (PRE) for three models. When calculating ACC, TPR, TNR, and PRE, assume that we use a cut-off value of 0.5 for the predicted probabilities. Summarize these numbers in a table like the following. Decide and comment on which model you would use to predict sentiment of a tweet moving forward.

Stats - LL AUC ACC TPR TNR PRE

Regression Types - Logistic Regression
Logistic Regression with Ridge Penalty
Logistic Regression with Lasso Penalty

names <- c("LL", "AUC", "ACC", "TNR", "PRE")


print(rbind(names, mod_1_stats, mod_2_stats, mod_3_stats))
## Warning in rbind(names, mod_1_stats, mod_2_stats, mod_3_stats): number of
## columns of result is not a multiple of vector length (arg 1)
##             [,1]                       [,2]               [,3]               
## names       "LL"                       "AUC"              "ACC"              
## mod_1_stats "Unregularized Regression" "10.3617528580877" "0.699345240746515"
## mod_2_stats "Ridge Regression"         "6.56244747477605" "0.81005300432052" 
## mod_3_stats "Lasso Regression"         "6.79269798810115" "0.802124626965391"
##             [,4]                [,5]                [,6]               
## names       "TNR"               "PRE"               "LL"               
## mod_1_stats "0.685314685314685" "0.713375796178344" "0.685314685314685"
## mod_2_stats "0.811188811188811" "0.808917197452229" "0.794520547945205"
## mod_3_stats "0.776223776223776" "0.828025477707006" "0.804347826086957"

I would use ridge model, due to higher accuracy. There is a slight tradeoff w/ preision and area under curve with using lasso model.

Task 1.6

For the model you decided in 1.5, find and report the most important 10 predictors of sentiment and th0eir coefficients. Briefly comment which variables seem to be the most important predictors.

#install.packages('vip')

require(vip)
## Loading required package: vip
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
vip(mod_2, 
    num_features = 10, 
    geom = "point") + 
  theme_bw()

In this model, it seems the month, date, and hour variables are all strong predictors, as well as a few word embeddings.

Task 1.7.

Below are the two tweets I picked from my timeline. Use the model you decided in Task 1.5 to predict a probability that the sentiment being positive for these tweets. You are welcome to extract the word embeddings for these tweets by yourself (model: roberta-base, layer=12). Assume that all these tweets are posted on Saturday, May 1, 2021 at 12pm. For convenience, you can also download the dataset from the link below in case you have trouble in extracting the word embeddings.

tweet1  <- "You are not getting displaced you decide to sell your $800k townhome because a 12-story apartment goes up next door, and then can't find another $800k townhome in Arlington."

tweet2  <- "One cold morning and I'm in holiday mode. Bought gifts for students."

new_tweets <- read.csv('https://raw.githubusercontent.com/uo-datasci-specialization/c4-ml-fall-2021/main/content/post/hw2/data/toy_tweet_embeddings.csv',header=TRUE)



pred_tweet <- predict(mod_2, new_tweets, type = "response")

cbind(new_tweets$tweet, pred_tweet)

Predicted probability positive Tweet 1 = .47 Predicted probability positive Tweet 1 = .28

Task 1.8.

Let’s do an experiment and test whether or not the model is biased against certain groups when detecting sentiment of a given text. Below you will find 10 hypothetical tweets with an identical structure. The only thing that changes from tweet to tweet is the subject.

bias_check <- read.csv('https://raw.githubusercontent.com/uo-datasci-specialization/c4-ml-fall-2021/main/content/post/hw2/data/bias_check_tweet_embeddings.csv',header=TRUE)


pred_tweet2 <- predict(mod_2, bias_check, type = "response")

cbind(bias_check$tweet, pred_tweet2) %>% arrange(Negative)

It seems like algorithm is most biased towards Muslim people, and least biased towards Atheist people. This statement is least likely to have a negative sentiment when Muslim people are the group, and most likely to have a negative sentiment when Atheist is subject.

Part 2: Linear Regression

Load and prep data

# Load the following packages needed for modeling in this assignment

  require(caret)
  require(recipes)
  require(finalfit)
  require(glmnet)


# Import the oregon dataset

#oregon <- read.csv('https://raw.githubusercontent.com/uo-datasci-specialization/c4-ml-fall-2021/main/content/post/hw2/data/hw1_oregon_final.csv',header=TRUE)

oregon <- read.csv("Oregon_Data.csv")

# Recipe for the oregon dataset

  outcome <- 'score'
  
  id      <- 'id'

  categorical <- c('sex','ethnic_cd','tst_bnch','migrant_ed_fg','ind_ed_fg',
                   'sp_ed_fg','tag_ed_fg','econ_dsvntg','stay_in_dist',
                   'stay_in_schl','dist_sped','trgt_assist_fg',
                   'ayp_dist_partic','ayp_schl_partic','ayp_dist_prfrm',
                   'ayp_schl_prfrm','rc_dist_partic','rc_schl_partic',
                   'rc_dist_prfrm','rc_schl_prfrm','grp_rpt_dist_partic',
                   'grp_rpt_schl_partic','grp_rpt_dist_prfrm',
                   'grp_rpt_schl_prfrm')

  numeric <- c('enrl_grd')

  cyclic <- c('date','month')


blueprint_oregon <- recipe(x     = oregon,
                    vars  = c(outcome,categorical,numeric,cyclic),
                    roles = c('outcome',rep('predictor',27))) %>%
  step_indicate_na(all_of(categorical),all_of(numeric)) %>%
  step_zv(all_numeric()) %>%
  step_impute_mean(all_of(numeric)) %>%
  step_impute_mode(all_of(categorical)) %>%
  step_harmonic('date',frequency=1,cycle_size=31,role='predictor') %>%
  step_harmonic('month',frequency=1,cycle_size=12,role='predictor') %>%
  step_ns('enrl_grd',deg_free=3) %>%
  step_normalize(c(paste0(numeric,'_ns_1'),paste0(numeric,'_ns_2'),paste0(numeric,'_ns_3'))) %>%
  step_normalize(c("date_sin_1","date_cos_1","month_sin_1","month_cos_1")) %>%
  step_dummy(all_of(categorical),one_hot=TRUE) %>%
  step_rm(c('date','month'))
    
  print(blueprint_oregon %>% prep() %>% summary)
## # A tibble: 73 x 4
##    variable              type    role      source  
##    <chr>                 <chr>   <chr>     <chr>   
##  1 score                 numeric outcome   original
##  2 na_ind_ind_ed_fg      numeric predictor derived 
##  3 na_ind_sp_ed_fg       numeric predictor derived 
##  4 na_ind_tag_ed_fg      numeric predictor derived 
##  5 na_ind_econ_dsvntg    numeric predictor derived 
##  6 na_ind_stay_in_dist   numeric predictor derived 
##  7 na_ind_stay_in_schl   numeric predictor derived 
##  8 na_ind_dist_sped      numeric predictor derived 
##  9 na_ind_trgt_assist_fg numeric predictor derived 
## 10 date_sin_1            numeric predictor derived 
## # ... with 63 more rows
  blueprint_oregon_ridge <- recipe(x     = oregon,
                    vars  = c(outcome,categorical,numeric,cyclic),
                    roles = c('outcome',rep('predictor',27))) %>%
  step_indicate_na(all_of(categorical),all_of(numeric)) %>%
  step_zv(all_numeric()) %>%
  step_impute_mean(all_of(numeric)) %>%
  step_impute_mode(all_of(categorical)) %>%
  step_harmonic('date',frequency=1,cycle_size=31,role='predictor') %>%
  step_harmonic('month',frequency=1,cycle_size=12,role='predictor') %>%
  step_ns('enrl_grd',deg_free=3) %>%
  step_normalize(c(paste0(numeric,'_ns_1'),paste0(numeric,'_ns_2'),paste0(numeric,'_ns_3'))) %>%
  step_normalize(c("date_sin_1","date_cos_1","month_sin_1","month_cos_1")) %>%
  step_dummy(all_of(categorical),one_hot=TRUE) %>%
  step_rm(c('date','month')) %>%
  step_normalize(everything())

Task 2.1.

Check the dataset for missingness. If there is any variable with more than 75% missingness, remove these variables.

ff_glimpse(oregon)
## $Continuous
##             label var_type      n missing_n missing_percent     mean      sd
## X               X    <int> 189426         0             0.0  94713.5 54682.7
## id             id    <int> 189426         0             0.0 126190.5 72879.8
## enrl_grd enrl_grd    <int> 189426         0             0.0      5.5     1.7
## score       score    <int> 189426         0             0.0   2499.0   115.8
## month       month    <int> 189426         0             0.0      5.0     0.4
## date         date    <int> 189426         0             0.0     17.4     8.4
##             min quartile_25   median quartile_75      max
## X           1.0     47357.2  94713.5    142069.8 189426.0
## id          1.0     63135.2 126095.5    189261.8 252568.0
## enrl_grd    3.0         4.0      5.0         7.0      8.0
## score    1601.0      2421.0   2498.0      2576.0   3550.0
## month       2.0         5.0      5.0         5.0      6.0
## date        1.0        10.0     18.0        24.0     31.0
## 
## $Categorical
##                                   label var_type      n missing_n
## sex                                 sex    <chr> 189426         0
## ethnic_cd                     ethnic_cd    <chr> 189426         0
## tst_bnch                       tst_bnch    <chr> 189426         0
## migrant_ed_fg             migrant_ed_fg    <chr> 189426         0
## ind_ed_fg                     ind_ed_fg    <chr> 189365        61
## sp_ed_fg                       sp_ed_fg    <chr> 189365        61
## tag_ed_fg                     tag_ed_fg    <chr> 188963       463
## econ_dsvntg                 econ_dsvntg    <chr> 188895       531
## stay_in_dist               stay_in_dist    <chr> 188963       463
## stay_in_schl               stay_in_schl    <chr> 188963       463
## dist_sped                     dist_sped    <chr> 188963       463
## trgt_assist_fg           trgt_assist_fg    <chr> 188956       470
## ayp_dist_partic         ayp_dist_partic    <chr> 189426         0
## ayp_schl_partic         ayp_schl_partic    <chr> 189426         0
## ayp_dist_prfrm           ayp_dist_prfrm    <chr> 189426         0
## ayp_schl_prfrm           ayp_schl_prfrm    <chr> 189426         0
## rc_dist_partic           rc_dist_partic    <chr> 189426         0
## rc_schl_partic           rc_schl_partic    <chr> 189426         0
## rc_dist_prfrm             rc_dist_prfrm    <chr> 189426         0
## rc_schl_prfrm             rc_schl_prfrm    <chr> 189426         0
## grp_rpt_dist_partic grp_rpt_dist_partic    <chr> 189426         0
## grp_rpt_schl_partic grp_rpt_schl_partic    <chr> 189426         0
## grp_rpt_dist_prfrm   grp_rpt_dist_prfrm    <chr> 189426         0
## grp_rpt_schl_prfrm   grp_rpt_schl_prfrm    <chr> 189426         0
##                     missing_percent levels_n levels levels_count levels_percent
## sex                             0.0        2      -            -              -
## ethnic_cd                       0.0        7      -            -              -
## tst_bnch                        0.0        6      -            -              -
## migrant_ed_fg                   0.0        2      -            -              -
## ind_ed_fg                       0.0        2      -            -              -
## sp_ed_fg                        0.0        2      -            -              -
## tag_ed_fg                       0.2        2      -            -              -
## econ_dsvntg                     0.3        2      -            -              -
## stay_in_dist                    0.2        2      -            -              -
## stay_in_schl                    0.2        2      -            -              -
## dist_sped                       0.2        2      -            -              -
## trgt_assist_fg                  0.2        2      -            -              -
## ayp_dist_partic                 0.0        2      -            -              -
## ayp_schl_partic                 0.0        2      -            -              -
## ayp_dist_prfrm                  0.0        2      -            -              -
## ayp_schl_prfrm                  0.0        2      -            -              -
## rc_dist_partic                  0.0        2      -            -              -
## rc_schl_partic                  0.0        2      -            -              -
## rc_dist_prfrm                   0.0        2      -            -              -
## rc_schl_prfrm                   0.0        2      -            -              -
## grp_rpt_dist_partic             0.0        2      -            -              -
## grp_rpt_schl_partic             0.0        2      -            -              -
## grp_rpt_dist_prfrm              0.0        2      -            -              -
## grp_rpt_schl_prfrm              0.0        2      -            -              -

Task 2.2.

Split the original data into two subsets: training and test. Let the training data have the 80% of cases and the test data have the 20% of the cases.

set.seed(11172021)  # for reproducibility
  
loc      <- sample(1:nrow(oregon), round(nrow(oregon) * 0.8))
oregon_train  <- oregon[loc, ]
oregon_test  <- oregon[-loc, ]

Task 2.3.

Use the caret::train() function to train a model with 10-fold cross-validation to predict the scores using linear regression without any regularization. Evaluate the performance of the model on both training and test datasets. Evaluate and report RMSE, R-square, and MAE for both training and test datasets. Is there any evidence of overfitting?

#making a crossfold training function
crossfold <- function(training_data, folds){

  #randomly shuffle data
  # Randomly shuffle the data
training_data = training_data[sample(nrow(training_data)),]
  
  
    # Create 10 folds with equal size

      N_folds = cut(seq(1,nrow(training_data)),breaks= folds,labels=FALSE)
  
    # Create the list for each fold 
      
      my.indices <- vector('list',folds)
      for(i in 1:folds){
        my.indices[[i]] <- which(N_folds!=i)
      }
      
cv <- trainControl(method = "cv",
                   index  = my.indices)

cv
}
cross_or <- crossfold(oregon_train, 10)

or_mod_1 <- caret::train(blueprint_oregon, 
                          data      = oregon_train, 
                          method    = "lm", 
                          trControl = cross_or)
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
or_mod_1
## Linear Regression 
## 
## 151541 samples
##     29 predictor
## 
## Recipe steps: indicate_na, zv, impute_mean, impute_mode, harmonic,
##  harmonic, ns, normalize, normalize, dummy, rm 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 136386, 136387, 136387, 136387, 136387, 136387, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   89.81761  0.4009267  69.57297
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
predicted_test <- predict(or_mod_1, oregon_test)
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
RMSE_test <- RMSE(predicted_test, oregon_test$score)
cat("Test RMSEA is", RMSE_test, "\n")
## Test RMSEA is 88.96643
rsq_test <- cor(predicted_test, oregon_test$score)^2
cat("Test R-squared is", rsq_test, "\n")
## Test R-squared is 0.4013898
mae_test <- MAE(predicted_test, oregon_test$score)
cat("Test MAE is", mae_test, "\n")
## Test MAE is 69.18454
mod_1_stats <- c("Unregularized Regression",  RMSE_test, rsq_test, mae_test)

I recieved an error: rank deficient model. Performance statistics between train and test datasets appear quite similar. Interestingly, less error variance for test data. So it seems, no, model is not overfit to training dataset.

Task 2.4.

Use the caret::train() function to train a model with 10-fold cross-validation to predict the scores using ridge regression. Try different values of lambda to decide optimal value. Evaluate the performance of the model on the test dataset, and report RMSE, R-square, and MAE. Does ridge regression provide any improvement over linear regression with no regularization?

ridge_grid <- data.frame(alpha = 0, lambda = seq(.003, .04, .0001))


cross_or <- crossfold(oregon_train, 10)

or_mod_2 <- caret::train(blueprint_oregon_ridge, 
                          data      = oregon_train, 
                          method    = "glmnet", 
                          trControl = cross_or, 
                          tuneGrid = ridge_grid)

or_mod_2
## glmnet 
## 
## 151541 samples
##     29 predictor
## 
## Recipe steps: indicate_na, zv, impute_mean, impute_mode, harmonic,
##  harmonic, ns, normalize, normalize, dummy, rm, normalize 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 136386, 136387, 136387, 136387, 136387, 136387, ... 
## Resampling results across tuning parameters:
## 
##   lambda  RMSE       Rsquared   MAE      
##   0.0030  0.7731359  0.4023295  0.5996013
##   0.0031  0.7731359  0.4023295  0.5996013
##   0.0032  0.7731359  0.4023295  0.5996013
##   0.0033  0.7731359  0.4023295  0.5996013
##   0.0034  0.7731359  0.4023295  0.5996013
##   0.0035  0.7731359  0.4023295  0.5996013
##   0.0036  0.7731359  0.4023295  0.5996013
##   0.0037  0.7731359  0.4023295  0.5996013
##   0.0038  0.7731359  0.4023295  0.5996013
##   0.0039  0.7731359  0.4023295  0.5996013
##   0.0040  0.7731359  0.4023295  0.5996013
##   0.0041  0.7731359  0.4023295  0.5996013
##   0.0042  0.7731359  0.4023295  0.5996013
##   0.0043  0.7731359  0.4023295  0.5996013
##   0.0044  0.7731359  0.4023295  0.5996013
##   0.0045  0.7731359  0.4023295  0.5996013
##   0.0046  0.7731359  0.4023295  0.5996013
##   0.0047  0.7731359  0.4023295  0.5996013
##   0.0048  0.7731359  0.4023295  0.5996013
##   0.0049  0.7731359  0.4023295  0.5996013
##   0.0050  0.7731359  0.4023295  0.5996013
##   0.0051  0.7731359  0.4023295  0.5996013
##   0.0052  0.7731359  0.4023295  0.5996013
##   0.0053  0.7731359  0.4023295  0.5996013
##   0.0054  0.7731359  0.4023295  0.5996013
##   0.0055  0.7731359  0.4023295  0.5996013
##   0.0056  0.7731359  0.4023295  0.5996013
##   0.0057  0.7731359  0.4023295  0.5996013
##   0.0058  0.7731359  0.4023295  0.5996013
##   0.0059  0.7731359  0.4023295  0.5996013
##   0.0060  0.7731359  0.4023295  0.5996013
##   0.0061  0.7731359  0.4023295  0.5996013
##   0.0062  0.7731359  0.4023295  0.5996013
##   0.0063  0.7731359  0.4023295  0.5996013
##   0.0064  0.7731359  0.4023295  0.5996013
##   0.0065  0.7731359  0.4023295  0.5996013
##   0.0066  0.7731359  0.4023295  0.5996013
##   0.0067  0.7731359  0.4023295  0.5996013
##   0.0068  0.7731359  0.4023295  0.5996013
##   0.0069  0.7731359  0.4023295  0.5996013
##   0.0070  0.7731359  0.4023295  0.5996013
##   0.0071  0.7731359  0.4023295  0.5996013
##   0.0072  0.7731359  0.4023295  0.5996013
##   0.0073  0.7731359  0.4023295  0.5996013
##   0.0074  0.7731359  0.4023295  0.5996013
##   0.0075  0.7731359  0.4023295  0.5996013
##   0.0076  0.7731359  0.4023295  0.5996013
##   0.0077  0.7731359  0.4023295  0.5996013
##   0.0078  0.7731359  0.4023295  0.5996013
##   0.0079  0.7731359  0.4023295  0.5996013
##   0.0080  0.7731359  0.4023295  0.5996013
##   0.0081  0.7731359  0.4023295  0.5996013
##   0.0082  0.7731359  0.4023295  0.5996013
##   0.0083  0.7731359  0.4023295  0.5996013
##   0.0084  0.7731359  0.4023295  0.5996013
##   0.0085  0.7731359  0.4023295  0.5996013
##   0.0086  0.7731359  0.4023295  0.5996013
##   0.0087  0.7731359  0.4023295  0.5996013
##   0.0088  0.7731359  0.4023295  0.5996013
##   0.0089  0.7731359  0.4023295  0.5996013
##   0.0090  0.7731359  0.4023295  0.5996013
##   0.0091  0.7731359  0.4023295  0.5996013
##   0.0092  0.7731359  0.4023295  0.5996013
##   0.0093  0.7731359  0.4023295  0.5996013
##   0.0094  0.7731359  0.4023295  0.5996013
##   0.0095  0.7731359  0.4023295  0.5996013
##   0.0096  0.7731359  0.4023295  0.5996013
##   0.0097  0.7731359  0.4023295  0.5996013
##   0.0098  0.7731359  0.4023295  0.5996013
##   0.0099  0.7731359  0.4023295  0.5996013
##   0.0100  0.7731359  0.4023295  0.5996013
##   0.0101  0.7731359  0.4023295  0.5996013
##   0.0102  0.7731359  0.4023295  0.5996013
##   0.0103  0.7731359  0.4023295  0.5996013
##   0.0104  0.7731359  0.4023295  0.5996013
##   0.0105  0.7731359  0.4023295  0.5996013
##   0.0106  0.7731359  0.4023295  0.5996013
##   0.0107  0.7731359  0.4023295  0.5996013
##   0.0108  0.7731359  0.4023295  0.5996013
##   0.0109  0.7731359  0.4023295  0.5996013
##   0.0110  0.7731359  0.4023295  0.5996013
##   0.0111  0.7731359  0.4023295  0.5996013
##   0.0112  0.7731359  0.4023295  0.5996013
##   0.0113  0.7731359  0.4023295  0.5996013
##   0.0114  0.7731359  0.4023295  0.5996013
##   0.0115  0.7731359  0.4023295  0.5996013
##   0.0116  0.7731359  0.4023295  0.5996013
##   0.0117  0.7731359  0.4023295  0.5996013
##   0.0118  0.7731359  0.4023295  0.5996013
##   0.0119  0.7731359  0.4023295  0.5996013
##   0.0120  0.7731359  0.4023295  0.5996013
##   0.0121  0.7731359  0.4023295  0.5996013
##   0.0122  0.7731359  0.4023295  0.5996013
##   0.0123  0.7731359  0.4023295  0.5996013
##   0.0124  0.7731359  0.4023295  0.5996013
##   0.0125  0.7731359  0.4023295  0.5996013
##   0.0126  0.7731359  0.4023295  0.5996013
##   0.0127  0.7731359  0.4023295  0.5996013
##   0.0128  0.7731359  0.4023295  0.5996013
##   0.0129  0.7731359  0.4023295  0.5996013
##   0.0130  0.7731359  0.4023295  0.5996013
##   0.0131  0.7731359  0.4023295  0.5996013
##   0.0132  0.7731359  0.4023295  0.5996013
##   0.0133  0.7731359  0.4023295  0.5996013
##   0.0134  0.7731359  0.4023295  0.5996013
##   0.0135  0.7731359  0.4023295  0.5996013
##   0.0136  0.7731359  0.4023295  0.5996013
##   0.0137  0.7731359  0.4023295  0.5996013
##   0.0138  0.7731359  0.4023295  0.5996013
##   0.0139  0.7731359  0.4023295  0.5996013
##   0.0140  0.7731359  0.4023295  0.5996013
##   0.0141  0.7731359  0.4023295  0.5996013
##   0.0142  0.7731359  0.4023295  0.5996013
##   0.0143  0.7731359  0.4023295  0.5996013
##   0.0144  0.7731359  0.4023295  0.5996013
##   0.0145  0.7731359  0.4023295  0.5996013
##   0.0146  0.7731359  0.4023295  0.5996013
##   0.0147  0.7731359  0.4023295  0.5996013
##   0.0148  0.7731359  0.4023295  0.5996013
##   0.0149  0.7731359  0.4023295  0.5996013
##   0.0150  0.7731359  0.4023295  0.5996013
##   0.0151  0.7731359  0.4023295  0.5996013
##   0.0152  0.7731359  0.4023295  0.5996013
##   0.0153  0.7731359  0.4023295  0.5996013
##   0.0154  0.7731359  0.4023295  0.5996013
##   0.0155  0.7731359  0.4023295  0.5996013
##   0.0156  0.7731359  0.4023295  0.5996013
##   0.0157  0.7731359  0.4023295  0.5996013
##   0.0158  0.7731359  0.4023295  0.5996013
##   0.0159  0.7731359  0.4023295  0.5996013
##   0.0160  0.7731359  0.4023295  0.5996013
##   0.0161  0.7731359  0.4023295  0.5996013
##   0.0162  0.7731359  0.4023295  0.5996013
##   0.0163  0.7731359  0.4023295  0.5996013
##   0.0164  0.7731359  0.4023295  0.5996013
##   0.0165  0.7731359  0.4023295  0.5996013
##   0.0166  0.7731359  0.4023295  0.5996013
##   0.0167  0.7731359  0.4023295  0.5996013
##   0.0168  0.7731359  0.4023295  0.5996013
##   0.0169  0.7731359  0.4023295  0.5996013
##   0.0170  0.7731359  0.4023295  0.5996013
##   0.0171  0.7731359  0.4023295  0.5996013
##   0.0172  0.7731359  0.4023295  0.5996013
##   0.0173  0.7731359  0.4023295  0.5996013
##   0.0174  0.7731359  0.4023295  0.5996013
##   0.0175  0.7731359  0.4023295  0.5996013
##   0.0176  0.7731359  0.4023295  0.5996013
##   0.0177  0.7731359  0.4023295  0.5996013
##   0.0178  0.7731359  0.4023295  0.5996013
##   0.0179  0.7731359  0.4023295  0.5996013
##   0.0180  0.7731359  0.4023295  0.5996013
##   0.0181  0.7731359  0.4023295  0.5996013
##   0.0182  0.7731359  0.4023295  0.5996013
##   0.0183  0.7731359  0.4023295  0.5996013
##   0.0184  0.7731359  0.4023295  0.5996013
##   0.0185  0.7731359  0.4023295  0.5996013
##   0.0186  0.7731359  0.4023295  0.5996013
##   0.0187  0.7731359  0.4023295  0.5996013
##   0.0188  0.7731359  0.4023295  0.5996013
##   0.0189  0.7731359  0.4023295  0.5996013
##   0.0190  0.7731359  0.4023295  0.5996013
##   0.0191  0.7731359  0.4023295  0.5996013
##   0.0192  0.7731359  0.4023295  0.5996013
##   0.0193  0.7731359  0.4023295  0.5996013
##   0.0194  0.7731359  0.4023295  0.5996013
##   0.0195  0.7731359  0.4023295  0.5996013
##   0.0196  0.7731359  0.4023295  0.5996013
##   0.0197  0.7731359  0.4023295  0.5996013
##   0.0198  0.7731359  0.4023295  0.5996013
##   0.0199  0.7731359  0.4023295  0.5996013
##   0.0200  0.7731359  0.4023295  0.5996013
##   0.0201  0.7731359  0.4023295  0.5996013
##   0.0202  0.7731359  0.4023295  0.5996013
##   0.0203  0.7731359  0.4023295  0.5996013
##   0.0204  0.7731359  0.4023295  0.5996013
##   0.0205  0.7731359  0.4023295  0.5996013
##   0.0206  0.7731359  0.4023295  0.5996013
##   0.0207  0.7731359  0.4023295  0.5996013
##   0.0208  0.7731359  0.4023295  0.5996013
##   0.0209  0.7731359  0.4023295  0.5996013
##   0.0210  0.7731359  0.4023295  0.5996013
##   0.0211  0.7731359  0.4023295  0.5996013
##   0.0212  0.7731359  0.4023295  0.5996013
##   0.0213  0.7731359  0.4023295  0.5996013
##   0.0214  0.7731359  0.4023295  0.5996013
##   0.0215  0.7731359  0.4023295  0.5996013
##   0.0216  0.7731359  0.4023295  0.5996013
##   0.0217  0.7731359  0.4023295  0.5996013
##   0.0218  0.7731359  0.4023295  0.5996013
##   0.0219  0.7731359  0.4023295  0.5996013
##   0.0220  0.7731359  0.4023295  0.5996013
##   0.0221  0.7731359  0.4023295  0.5996013
##   0.0222  0.7731359  0.4023295  0.5996013
##   0.0223  0.7731359  0.4023295  0.5996013
##   0.0224  0.7731359  0.4023295  0.5996013
##   0.0225  0.7731359  0.4023295  0.5996013
##   0.0226  0.7731359  0.4023295  0.5996013
##   0.0227  0.7731359  0.4023295  0.5996013
##   0.0228  0.7731359  0.4023295  0.5996013
##   0.0229  0.7731359  0.4023295  0.5996013
##   0.0230  0.7731359  0.4023295  0.5996013
##   0.0231  0.7731359  0.4023295  0.5996013
##   0.0232  0.7731359  0.4023295  0.5996013
##   0.0233  0.7731359  0.4023295  0.5996013
##   0.0234  0.7731359  0.4023295  0.5996013
##   0.0235  0.7731359  0.4023295  0.5996013
##   0.0236  0.7731359  0.4023295  0.5996013
##   0.0237  0.7731359  0.4023295  0.5996013
##   0.0238  0.7731359  0.4023295  0.5996013
##   0.0239  0.7731359  0.4023295  0.5996013
##   0.0240  0.7731359  0.4023295  0.5996013
##   0.0241  0.7731359  0.4023295  0.5996013
##   0.0242  0.7731359  0.4023295  0.5996013
##   0.0243  0.7731359  0.4023295  0.5996013
##   0.0244  0.7731359  0.4023295  0.5996013
##   0.0245  0.7731359  0.4023295  0.5996013
##   0.0246  0.7731359  0.4023295  0.5996013
##   0.0247  0.7731359  0.4023295  0.5996013
##   0.0248  0.7731359  0.4023295  0.5996013
##   0.0249  0.7731359  0.4023295  0.5996013
##   0.0250  0.7731359  0.4023295  0.5996013
##   0.0251  0.7731359  0.4023295  0.5996013
##   0.0252  0.7731359  0.4023295  0.5996013
##   0.0253  0.7731359  0.4023295  0.5996013
##   0.0254  0.7731359  0.4023295  0.5996013
##   0.0255  0.7731359  0.4023295  0.5996013
##   0.0256  0.7731359  0.4023295  0.5996013
##   0.0257  0.7731359  0.4023295  0.5996013
##   0.0258  0.7731359  0.4023295  0.5996013
##   0.0259  0.7731359  0.4023295  0.5996013
##   0.0260  0.7731359  0.4023295  0.5996013
##   0.0261  0.7731359  0.4023295  0.5996013
##   0.0262  0.7731359  0.4023295  0.5996013
##   0.0263  0.7731359  0.4023295  0.5996013
##   0.0264  0.7731359  0.4023295  0.5996013
##   0.0265  0.7731359  0.4023295  0.5996013
##   0.0266  0.7731359  0.4023295  0.5996013
##   0.0267  0.7731359  0.4023295  0.5996013
##   0.0268  0.7731359  0.4023295  0.5996013
##   0.0269  0.7731359  0.4023295  0.5996013
##   0.0270  0.7731359  0.4023295  0.5996013
##   0.0271  0.7731359  0.4023295  0.5996013
##   0.0272  0.7731359  0.4023295  0.5996013
##   0.0273  0.7731359  0.4023295  0.5996013
##   0.0274  0.7731359  0.4023295  0.5996013
##   0.0275  0.7731359  0.4023295  0.5996013
##   0.0276  0.7731359  0.4023295  0.5996013
##   0.0277  0.7731359  0.4023295  0.5996013
##   0.0278  0.7731359  0.4023295  0.5996013
##   0.0279  0.7731359  0.4023295  0.5996013
##   0.0280  0.7731359  0.4023295  0.5996013
##   0.0281  0.7731359  0.4023295  0.5996013
##   0.0282  0.7731359  0.4023295  0.5996013
##   0.0283  0.7731359  0.4023295  0.5996013
##   0.0284  0.7731359  0.4023295  0.5996013
##   0.0285  0.7731359  0.4023295  0.5996013
##   0.0286  0.7731359  0.4023295  0.5996013
##   0.0287  0.7731359  0.4023295  0.5996013
##   0.0288  0.7731359  0.4023295  0.5996013
##   0.0289  0.7731359  0.4023295  0.5996013
##   0.0290  0.7731359  0.4023295  0.5996013
##   0.0291  0.7731359  0.4023295  0.5996013
##   0.0292  0.7731359  0.4023295  0.5996013
##   0.0293  0.7731359  0.4023295  0.5996013
##   0.0294  0.7731359  0.4023295  0.5996013
##   0.0295  0.7731359  0.4023295  0.5996013
##   0.0296  0.7731359  0.4023295  0.5996013
##   0.0297  0.7731359  0.4023295  0.5996013
##   0.0298  0.7731359  0.4023295  0.5996013
##   0.0299  0.7731359  0.4023295  0.5996013
##   0.0300  0.7731359  0.4023295  0.5996013
##   0.0301  0.7731359  0.4023295  0.5996013
##   0.0302  0.7731359  0.4023295  0.5996013
##   0.0303  0.7731359  0.4023295  0.5996013
##   0.0304  0.7731359  0.4023295  0.5996013
##   0.0305  0.7731359  0.4023295  0.5996013
##   0.0306  0.7731359  0.4023295  0.5996013
##   0.0307  0.7731359  0.4023295  0.5996013
##   0.0308  0.7731359  0.4023295  0.5996013
##   0.0309  0.7731359  0.4023295  0.5996013
##   0.0310  0.7731359  0.4023295  0.5996013
##   0.0311  0.7731359  0.4023295  0.5996013
##   0.0312  0.7731359  0.4023295  0.5996013
##   0.0313  0.7731359  0.4023295  0.5996013
##   0.0314  0.7731359  0.4023295  0.5996013
##   0.0315  0.7731359  0.4023295  0.5996013
##   0.0316  0.7731359  0.4023295  0.5996013
##   0.0317  0.7731359  0.4023295  0.5996013
##   0.0318  0.7731359  0.4023295  0.5996013
##   0.0319  0.7731359  0.4023295  0.5996013
##   0.0320  0.7731359  0.4023295  0.5996013
##   0.0321  0.7731359  0.4023295  0.5996013
##   0.0322  0.7731359  0.4023295  0.5996013
##   0.0323  0.7731359  0.4023295  0.5996013
##   0.0324  0.7731359  0.4023295  0.5996013
##   0.0325  0.7731359  0.4023295  0.5996013
##   0.0326  0.7731359  0.4023295  0.5996013
##   0.0327  0.7731359  0.4023295  0.5996013
##   0.0328  0.7731359  0.4023295  0.5996013
##   0.0329  0.7731359  0.4023295  0.5996013
##   0.0330  0.7731359  0.4023295  0.5996013
##   0.0331  0.7731359  0.4023295  0.5996013
##   0.0332  0.7731359  0.4023295  0.5996013
##   0.0333  0.7731359  0.4023295  0.5996013
##   0.0334  0.7731360  0.4023295  0.5996014
##   0.0335  0.7731363  0.4023295  0.5996018
##   0.0336  0.7731366  0.4023294  0.5996023
##   0.0337  0.7731369  0.4023294  0.5996028
##   0.0338  0.7731372  0.4023293  0.5996033
##   0.0339  0.7731375  0.4023293  0.5996037
##   0.0340  0.7731378  0.4023292  0.5996042
##   0.0341  0.7731381  0.4023292  0.5996047
##   0.0342  0.7731384  0.4023291  0.5996052
##   0.0343  0.7731387  0.4023291  0.5996057
##   0.0344  0.7731390  0.4023290  0.5996062
##   0.0345  0.7731393  0.4023290  0.5996067
##   0.0346  0.7731396  0.4023289  0.5996072
##   0.0347  0.7731399  0.4023289  0.5996077
##   0.0348  0.7731402  0.4023288  0.5996082
##   0.0349  0.7731405  0.4023288  0.5996087
##   0.0350  0.7731408  0.4023287  0.5996092
##   0.0351  0.7731411  0.4023287  0.5996097
##   0.0352  0.7731415  0.4023286  0.5996102
##   0.0353  0.7731418  0.4023286  0.5996107
##   0.0354  0.7731421  0.4023285  0.5996112
##   0.0355  0.7731424  0.4023285  0.5996117
##   0.0356  0.7731427  0.4023284  0.5996122
##   0.0357  0.7731430  0.4023283  0.5996127
##   0.0358  0.7731433  0.4023283  0.5996132
##   0.0359  0.7731437  0.4023282  0.5996137
##   0.0360  0.7731440  0.4023282  0.5996142
##   0.0361  0.7731443  0.4023281  0.5996147
##   0.0362  0.7731446  0.4023281  0.5996152
##   0.0363  0.7731449  0.4023280  0.5996157
##   0.0364  0.7731453  0.4023280  0.5996162
##   0.0365  0.7731456  0.4023279  0.5996167
##   0.0366  0.7731459  0.4023279  0.5996172
##   0.0367  0.7731462  0.4023278  0.5996177
##   0.0368  0.7731465  0.4023278  0.5996182
##   0.0369  0.7731469  0.4023277  0.5996188
##   0.0370  0.7731472  0.4023276  0.5996193
##   0.0371  0.7731475  0.4023276  0.5996198
##   0.0372  0.7731478  0.4023275  0.5996203
##   0.0373  0.7731482  0.4023275  0.5996208
##   0.0374  0.7731485  0.4023274  0.5996213
##   0.0375  0.7731488  0.4023273  0.5996218
##   0.0376  0.7731492  0.4023273  0.5996223
##   0.0377  0.7731495  0.4023272  0.5996228
##   0.0378  0.7731498  0.4023272  0.5996234
##   0.0379  0.7731501  0.4023271  0.5996239
##   0.0380  0.7731505  0.4023271  0.5996244
##   0.0381  0.7731508  0.4023270  0.5996249
##   0.0382  0.7731511  0.4023269  0.5996254
##   0.0383  0.7731515  0.4023269  0.5996259
##   0.0384  0.7731518  0.4023268  0.5996265
##   0.0385  0.7731522  0.4023268  0.5996270
##   0.0386  0.7731525  0.4023267  0.5996275
##   0.0387  0.7731528  0.4023266  0.5996280
##   0.0388  0.7731532  0.4023266  0.5996285
##   0.0389  0.7731535  0.4023265  0.5996291
##   0.0390  0.7731539  0.4023264  0.5996296
##   0.0391  0.7731542  0.4023264  0.5996301
##   0.0392  0.7731545  0.4023263  0.5996306
##   0.0393  0.7731549  0.4023263  0.5996311
##   0.0394  0.7731552  0.4023262  0.5996317
##   0.0395  0.7731556  0.4023261  0.5996322
##   0.0396  0.7731559  0.4023261  0.5996327
##   0.0397  0.7731563  0.4023260  0.5996332
##   0.0398  0.7731566  0.4023260  0.5996338
##   0.0399  0.7731570  0.4023259  0.5996343
##   0.0400  0.7731573  0.4023258  0.5996348
## 
## Tuning parameter 'alpha' was held constant at a value of 0
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 0 and lambda = 0.0333.
predicted_test <- predict(or_mod_2, oregon_test)


RMSE_test <- RMSE(predicted_test, oregon_test$score)
cat("Test RMSEA is", RMSE_test, "\n")
## Test RMSEA is 2501.218
rsq_test <- cor(predicted_test, oregon_test$score)^2
cat("Test R-squared is", rsq_test, "\n")
## Test R-squared is 0.4012969
mae_test <- MAE(predicted_test, oregon_test$score)
cat("Test MAE is", mae_test, "\n")
## Test MAE is 2498.592
mod_2_stats <- c("Ridge Regression", RMSE_test, rsq_test, mae_test)

Not that I can tell! Error increased from non-regularized model test; also over-fitting data (smaller error values for crossfold training set) I tried following hyperparameter tunings- 1) .01:3 by .1 (Used .01 for parameter) 2) .001:.01 by .0001 (Used .01) 3) .005: .05 by .0005 (Used .033) 4) .003 to .04 by .0001 (Used .0333) RMSEA remained at 2501.218 across models, so stopped testing

Task 2.5.

Use the caret::train() function to train a model with 10-fold cross-validation to predict the scores using lasso regression. Try different values of lambda to decide optimal value. Evaluate the performance of the model on the test dataset, and report RMSE, R-square, and MAE. Does lasso regression provide any improvement over linear regression with no regularization?

lasso_grid <- data.frame(alpha = 1, lambda = seq(0.00079, 0.00083, 0.0000001))

cross_or <- crossfold(oregon_train, 10)

or_mod_3 <- caret::train(blueprint_oregon_ridge, 
                          data      = oregon_train, 
                          method    = "glmnet", 
                          trControl = cross_or, 
                          tuneGrid = lasso_grid)

or_mod_3
## glmnet 
## 
## 151541 samples
##     29 predictor
## 
## Recipe steps: indicate_na, zv, impute_mean, impute_mode, harmonic,
##  harmonic, ns, normalize, normalize, dummy, rm, normalize 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 136386, 136387, 136387, 136387, 136387, 136387, ... 
## Resampling results across tuning parameters:
## 
##   lambda     RMSE       Rsquared   MAE      
##   0.0007900  0.7731122  0.4023048  0.5995311
##   0.0007901  0.7731122  0.4023048  0.5995311
##   0.0007902  0.7731122  0.4023048  0.5995311
##   0.0007903  0.7731122  0.4023048  0.5995311
##   0.0007904  0.7731122  0.4023048  0.5995311
##   0.0007905  0.7731122  0.4023048  0.5995311
##   0.0007906  0.7731122  0.4023048  0.5995311
##   0.0007907  0.7731122  0.4023048  0.5995311
##   0.0007908  0.7731122  0.4023048  0.5995311
##   0.0007909  0.7731121  0.4023048  0.5995311
##   0.0007910  0.7731121  0.4023048  0.5995311
##   0.0007911  0.7731121  0.4023048  0.5995311
##   0.0007912  0.7731121  0.4023048  0.5995311
##   0.0007913  0.7731121  0.4023048  0.5995311
##   0.0007914  0.7731121  0.4023048  0.5995311
##   0.0007915  0.7731121  0.4023048  0.5995311
##   0.0007916  0.7731121  0.4023048  0.5995311
##   0.0007917  0.7731121  0.4023048  0.5995311
##   0.0007918  0.7731121  0.4023048  0.5995311
##   0.0007919  0.7731121  0.4023048  0.5995311
##   0.0007920  0.7731121  0.4023048  0.5995311
##   0.0007921  0.7731121  0.4023048  0.5995311
##   0.0007922  0.7731121  0.4023048  0.5995311
##   0.0007923  0.7731121  0.4023048  0.5995311
##   0.0007924  0.7731121  0.4023049  0.5995311
##   0.0007925  0.7731121  0.4023049  0.5995311
##   0.0007926  0.7731121  0.4023049  0.5995311
##   0.0007927  0.7731121  0.4023049  0.5995312
##   0.0007928  0.7731121  0.4023049  0.5995312
##   0.0007929  0.7731121  0.4023049  0.5995312
##   0.0007930  0.7731121  0.4023049  0.5995312
##   0.0007931  0.7731121  0.4023049  0.5995312
##   0.0007932  0.7731121  0.4023049  0.5995312
##   0.0007933  0.7731121  0.4023049  0.5995312
##   0.0007934  0.7731121  0.4023049  0.5995312
##   0.0007935  0.7731121  0.4023049  0.5995312
##   0.0007936  0.7731121  0.4023049  0.5995312
##   0.0007937  0.7731121  0.4023049  0.5995312
##   0.0007938  0.7731121  0.4023049  0.5995312
##   0.0007939  0.7731121  0.4023049  0.5995312
##   0.0007940  0.7731121  0.4023049  0.5995312
##   0.0007941  0.7731121  0.4023049  0.5995312
##   0.0007942  0.7731121  0.4023049  0.5995312
##   0.0007943  0.7731122  0.4023049  0.5995312
##   0.0007944  0.7731122  0.4023049  0.5995312
##   0.0007945  0.7731122  0.4023049  0.5995312
##   0.0007946  0.7731122  0.4023049  0.5995312
##   0.0007947  0.7731122  0.4023049  0.5995312
##   0.0007948  0.7731122  0.4023049  0.5995312
##   0.0007949  0.7731122  0.4023049  0.5995312
##   0.0007950  0.7731122  0.4023049  0.5995312
##   0.0007951  0.7731122  0.4023049  0.5995312
##   0.0007952  0.7731122  0.4023049  0.5995312
##   0.0007953  0.7731122  0.4023049  0.5995312
##   0.0007954  0.7731122  0.4023048  0.5995312
##   0.0007955  0.7731122  0.4023048  0.5995312
##   0.0007956  0.7731122  0.4023048  0.5995312
##   0.0007957  0.7731122  0.4023048  0.5995312
##   0.0007958  0.7731122  0.4023048  0.5995312
##   0.0007959  0.7731122  0.4023048  0.5995313
##   0.0007960  0.7731122  0.4023048  0.5995313
##   0.0007961  0.7731122  0.4023048  0.5995313
##   0.0007962  0.7731122  0.4023048  0.5995313
##   0.0007963  0.7731122  0.4023048  0.5995313
##   0.0007964  0.7731122  0.4023048  0.5995313
##   0.0007965  0.7731122  0.4023048  0.5995313
##   0.0007966  0.7731122  0.4023048  0.5995313
##   0.0007967  0.7731122  0.4023048  0.5995313
##   0.0007968  0.7731122  0.4023048  0.5995313
##   0.0007969  0.7731122  0.4023048  0.5995313
##   0.0007970  0.7731122  0.4023048  0.5995313
##   0.0007971  0.7731122  0.4023048  0.5995313
##   0.0007972  0.7731122  0.4023048  0.5995313
##   0.0007973  0.7731122  0.4023048  0.5995313
##   0.0007974  0.7731122  0.4023048  0.5995313
##   0.0007975  0.7731122  0.4023048  0.5995313
##   0.0007976  0.7731122  0.4023048  0.5995313
##   0.0007977  0.7731122  0.4023048  0.5995313
##   0.0007978  0.7731122  0.4023048  0.5995313
##   0.0007979  0.7731122  0.4023048  0.5995313
##   0.0007980  0.7731122  0.4023048  0.5995313
##   0.0007981  0.7731122  0.4023048  0.5995313
##   0.0007982  0.7731122  0.4023048  0.5995313
##   0.0007983  0.7731122  0.4023048  0.5995313
##   0.0007984  0.7731122  0.4023048  0.5995313
##   0.0007985  0.7731122  0.4023048  0.5995313
##   0.0007986  0.7731122  0.4023048  0.5995313
##   0.0007987  0.7731122  0.4023048  0.5995313
##   0.0007988  0.7731122  0.4023048  0.5995313
##   0.0007989  0.7731122  0.4023048  0.5995313
##   0.0007990  0.7731122  0.4023048  0.5995314
##   0.0007991  0.7731122  0.4023048  0.5995314
##   0.0007992  0.7731122  0.4023048  0.5995314
##   0.0007993  0.7731122  0.4023048  0.5995314
##   0.0007994  0.7731122  0.4023048  0.5995314
##   0.0007995  0.7731122  0.4023048  0.5995314
##   0.0007996  0.7731122  0.4023048  0.5995314
##   0.0007997  0.7731122  0.4023048  0.5995314
##   0.0007998  0.7731122  0.4023048  0.5995314
##   0.0007999  0.7731122  0.4023048  0.5995314
##   0.0008000  0.7731122  0.4023048  0.5995314
##   0.0008001  0.7731122  0.4023048  0.5995314
##   0.0008002  0.7731122  0.4023048  0.5995314
##   0.0008003  0.7731122  0.4023048  0.5995314
##   0.0008004  0.7731122  0.4023048  0.5995314
##   0.0008005  0.7731122  0.4023048  0.5995314
##   0.0008006  0.7731122  0.4023048  0.5995314
##   0.0008007  0.7731122  0.4023048  0.5995314
##   0.0008008  0.7731122  0.4023048  0.5995314
##   0.0008009  0.7731122  0.4023048  0.5995314
##   0.0008010  0.7731122  0.4023048  0.5995314
##   0.0008011  0.7731122  0.4023048  0.5995314
##   0.0008012  0.7731122  0.4023048  0.5995314
##   0.0008013  0.7731122  0.4023048  0.5995314
##   0.0008014  0.7731122  0.4023048  0.5995314
##   0.0008015  0.7731122  0.4023048  0.5995314
##   0.0008016  0.7731122  0.4023048  0.5995314
##   0.0008017  0.7731122  0.4023048  0.5995314
##   0.0008018  0.7731122  0.4023048  0.5995314
##   0.0008019  0.7731122  0.4023048  0.5995314
##   0.0008020  0.7731122  0.4023048  0.5995314
##   0.0008021  0.7731122  0.4023048  0.5995315
##   0.0008022  0.7731122  0.4023048  0.5995315
##   0.0008023  0.7731122  0.4023048  0.5995315
##   0.0008024  0.7731122  0.4023048  0.5995315
##   0.0008025  0.7731122  0.4023048  0.5995315
##   0.0008026  0.7731122  0.4023048  0.5995315
##   0.0008027  0.7731122  0.4023048  0.5995315
##   0.0008028  0.7731122  0.4023048  0.5995315
##   0.0008029  0.7731122  0.4023048  0.5995315
##   0.0008030  0.7731122  0.4023048  0.5995315
##   0.0008031  0.7731122  0.4023048  0.5995315
##   0.0008032  0.7731122  0.4023048  0.5995315
##   0.0008033  0.7731122  0.4023048  0.5995315
##   0.0008034  0.7731122  0.4023048  0.5995315
##   0.0008035  0.7731122  0.4023048  0.5995315
##   0.0008036  0.7731122  0.4023048  0.5995315
##   0.0008037  0.7731122  0.4023048  0.5995315
##   0.0008038  0.7731122  0.4023048  0.5995315
##   0.0008039  0.7731122  0.4023048  0.5995315
##   0.0008040  0.7731122  0.4023048  0.5995315
##   0.0008041  0.7731122  0.4023048  0.5995315
##   0.0008042  0.7731122  0.4023048  0.5995315
##   0.0008043  0.7731122  0.4023048  0.5995315
##   0.0008044  0.7731122  0.4023048  0.5995315
##   0.0008045  0.7731122  0.4023048  0.5995315
##   0.0008046  0.7731122  0.4023048  0.5995315
##   0.0008047  0.7731122  0.4023048  0.5995315
##   0.0008048  0.7731122  0.4023048  0.5995315
##   0.0008049  0.7731122  0.4023048  0.5995315
##   0.0008050  0.7731122  0.4023048  0.5995315
##   0.0008051  0.7731122  0.4023048  0.5995316
##   0.0008052  0.7731122  0.4023048  0.5995316
##   0.0008053  0.7731122  0.4023048  0.5995316
##   0.0008054  0.7731122  0.4023048  0.5995316
##   0.0008055  0.7731122  0.4023048  0.5995316
##   0.0008056  0.7731122  0.4023048  0.5995316
##   0.0008057  0.7731122  0.4023048  0.5995316
##   0.0008058  0.7731122  0.4023048  0.5995316
##   0.0008059  0.7731122  0.4023048  0.5995316
##   0.0008060  0.7731122  0.4023048  0.5995316
##   0.0008061  0.7731122  0.4023048  0.5995316
##   0.0008062  0.7731122  0.4023048  0.5995316
##   0.0008063  0.7731122  0.4023048  0.5995316
##   0.0008064  0.7731122  0.4023048  0.5995316
##   0.0008065  0.7731122  0.4023048  0.5995316
##   0.0008066  0.7731122  0.4023048  0.5995316
##   0.0008067  0.7731122  0.4023048  0.5995316
##   0.0008068  0.7731122  0.4023048  0.5995316
##   0.0008069  0.7731122  0.4023048  0.5995316
##   0.0008070  0.7731122  0.4023048  0.5995316
##   0.0008071  0.7731122  0.4023048  0.5995316
##   0.0008072  0.7731122  0.4023048  0.5995316
##   0.0008073  0.7731122  0.4023048  0.5995316
##   0.0008074  0.7731122  0.4023048  0.5995316
##   0.0008075  0.7731122  0.4023048  0.5995316
##   0.0008076  0.7731122  0.4023048  0.5995316
##   0.0008077  0.7731122  0.4023048  0.5995316
##   0.0008078  0.7731122  0.4023048  0.5995316
##   0.0008079  0.7731122  0.4023048  0.5995316
##   0.0008080  0.7731122  0.4023048  0.5995316
##   0.0008081  0.7731122  0.4023048  0.5995316
##   0.0008082  0.7731122  0.4023048  0.5995317
##   0.0008083  0.7731122  0.4023048  0.5995317
##   0.0008084  0.7731122  0.4023048  0.5995317
##   0.0008085  0.7731122  0.4023048  0.5995317
##   0.0008086  0.7731122  0.4023048  0.5995317
##   0.0008087  0.7731122  0.4023048  0.5995317
##   0.0008088  0.7731122  0.4023048  0.5995317
##   0.0008089  0.7731122  0.4023048  0.5995317
##   0.0008090  0.7731122  0.4023048  0.5995317
##   0.0008091  0.7731122  0.4023048  0.5995317
##   0.0008092  0.7731122  0.4023048  0.5995317
##   0.0008093  0.7731122  0.4023048  0.5995317
##   0.0008094  0.7731122  0.4023048  0.5995317
##   0.0008095  0.7731122  0.4023048  0.5995317
##   0.0008096  0.7731122  0.4023048  0.5995317
##   0.0008097  0.7731122  0.4023048  0.5995317
##   0.0008098  0.7731122  0.4023048  0.5995317
##   0.0008099  0.7731122  0.4023048  0.5995317
##   0.0008100  0.7731122  0.4023048  0.5995317
##   0.0008101  0.7731122  0.4023048  0.5995317
##   0.0008102  0.7731122  0.4023048  0.5995317
##   0.0008103  0.7731122  0.4023048  0.5995317
##   0.0008104  0.7731122  0.4023048  0.5995317
##   0.0008105  0.7731122  0.4023048  0.5995317
##   0.0008106  0.7731122  0.4023048  0.5995317
##   0.0008107  0.7731122  0.4023048  0.5995317
##   0.0008108  0.7731122  0.4023048  0.5995317
##   0.0008109  0.7731122  0.4023048  0.5995317
##   0.0008110  0.7731122  0.4023048  0.5995317
##   0.0008111  0.7731122  0.4023048  0.5995317
##   0.0008112  0.7731122  0.4023048  0.5995317
##   0.0008113  0.7731122  0.4023048  0.5995318
##   0.0008114  0.7731122  0.4023048  0.5995318
##   0.0008115  0.7731122  0.4023048  0.5995318
##   0.0008116  0.7731122  0.4023048  0.5995318
##   0.0008117  0.7731122  0.4023048  0.5995318
##   0.0008118  0.7731122  0.4023048  0.5995318
##   0.0008119  0.7731122  0.4023048  0.5995318
##   0.0008120  0.7731122  0.4023048  0.5995318
##   0.0008121  0.7731122  0.4023048  0.5995318
##   0.0008122  0.7731122  0.4023048  0.5995318
##   0.0008123  0.7731122  0.4023048  0.5995318
##   0.0008124  0.7731122  0.4023048  0.5995318
##   0.0008125  0.7731122  0.4023048  0.5995318
##   0.0008126  0.7731122  0.4023048  0.5995318
##   0.0008127  0.7731122  0.4023048  0.5995318
##   0.0008128  0.7731122  0.4023048  0.5995318
##   0.0008129  0.7731122  0.4023048  0.5995318
##   0.0008130  0.7731122  0.4023048  0.5995318
##   0.0008131  0.7731122  0.4023048  0.5995318
##   0.0008132  0.7731122  0.4023048  0.5995318
##   0.0008133  0.7731122  0.4023048  0.5995318
##   0.0008134  0.7731122  0.4023048  0.5995318
##   0.0008135  0.7731122  0.4023048  0.5995318
##   0.0008136  0.7731122  0.4023048  0.5995318
##   0.0008137  0.7731122  0.4023048  0.5995318
##   0.0008138  0.7731122  0.4023048  0.5995318
##   0.0008139  0.7731122  0.4023048  0.5995318
##   0.0008140  0.7731122  0.4023048  0.5995318
##   0.0008141  0.7731122  0.4023048  0.5995318
##   0.0008142  0.7731122  0.4023048  0.5995318
##   0.0008143  0.7731122  0.4023048  0.5995318
##   0.0008144  0.7731122  0.4023048  0.5995319
##   0.0008145  0.7731122  0.4023048  0.5995319
##   0.0008146  0.7731122  0.4023048  0.5995319
##   0.0008147  0.7731122  0.4023048  0.5995319
##   0.0008148  0.7731122  0.4023048  0.5995319
##   0.0008149  0.7731122  0.4023048  0.5995319
##   0.0008150  0.7731122  0.4023048  0.5995319
##   0.0008151  0.7731122  0.4023048  0.5995319
##   0.0008152  0.7731122  0.4023048  0.5995319
##   0.0008153  0.7731122  0.4023048  0.5995319
##   0.0008154  0.7731122  0.4023048  0.5995319
##   0.0008155  0.7731122  0.4023048  0.5995319
##   0.0008156  0.7731122  0.4023048  0.5995319
##   0.0008157  0.7731122  0.4023048  0.5995319
##   0.0008158  0.7731122  0.4023048  0.5995319
##   0.0008159  0.7731122  0.4023048  0.5995319
##   0.0008160  0.7731122  0.4023048  0.5995319
##   0.0008161  0.7731122  0.4023048  0.5995319
##   0.0008162  0.7731122  0.4023048  0.5995319
##   0.0008163  0.7731122  0.4023048  0.5995319
##   0.0008164  0.7731122  0.4023048  0.5995319
##   0.0008165  0.7731122  0.4023048  0.5995319
##   0.0008166  0.7731122  0.4023048  0.5995319
##   0.0008167  0.7731122  0.4023048  0.5995319
##   0.0008168  0.7731122  0.4023048  0.5995319
##   0.0008169  0.7731122  0.4023048  0.5995319
##   0.0008170  0.7731122  0.4023048  0.5995319
##   0.0008171  0.7731122  0.4023048  0.5995319
##   0.0008172  0.7731122  0.4023048  0.5995319
##   0.0008173  0.7731122  0.4023048  0.5995319
##   0.0008174  0.7731122  0.4023048  0.5995319
##   0.0008175  0.7731122  0.4023048  0.5995320
##   0.0008176  0.7731122  0.4023048  0.5995320
##   0.0008177  0.7731122  0.4023048  0.5995320
##   0.0008178  0.7731122  0.4023048  0.5995320
##   0.0008179  0.7731122  0.4023048  0.5995320
##   0.0008180  0.7731122  0.4023048  0.5995320
##   0.0008181  0.7731122  0.4023048  0.5995320
##   0.0008182  0.7731122  0.4023048  0.5995320
##   0.0008183  0.7731122  0.4023048  0.5995320
##   0.0008184  0.7731122  0.4023048  0.5995320
##   0.0008185  0.7731122  0.4023048  0.5995320
##   0.0008186  0.7731122  0.4023048  0.5995320
##   0.0008187  0.7731122  0.4023048  0.5995320
##   0.0008188  0.7731122  0.4023048  0.5995320
##   0.0008189  0.7731122  0.4023048  0.5995320
##   0.0008190  0.7731122  0.4023048  0.5995320
##   0.0008191  0.7731122  0.4023048  0.5995320
##   0.0008192  0.7731122  0.4023048  0.5995320
##   0.0008193  0.7731122  0.4023048  0.5995320
##   0.0008194  0.7731122  0.4023048  0.5995320
##   0.0008195  0.7731122  0.4023048  0.5995320
##   0.0008196  0.7731122  0.4023048  0.5995320
##   0.0008197  0.7731122  0.4023048  0.5995320
##   0.0008198  0.7731122  0.4023048  0.5995320
##   0.0008199  0.7731122  0.4023048  0.5995320
##   0.0008200  0.7731122  0.4023048  0.5995320
##   0.0008201  0.7731122  0.4023048  0.5995320
##   0.0008202  0.7731122  0.4023048  0.5995320
##   0.0008203  0.7731122  0.4023048  0.5995320
##   0.0008204  0.7731122  0.4023048  0.5995320
##   0.0008205  0.7731122  0.4023048  0.5995320
##   0.0008206  0.7731122  0.4023048  0.5995321
##   0.0008207  0.7731122  0.4023048  0.5995321
##   0.0008208  0.7731122  0.4023048  0.5995321
##   0.0008209  0.7731122  0.4023048  0.5995321
##   0.0008210  0.7731122  0.4023048  0.5995321
##   0.0008211  0.7731122  0.4023048  0.5995321
##   0.0008212  0.7731122  0.4023048  0.5995321
##   0.0008213  0.7731122  0.4023048  0.5995321
##   0.0008214  0.7731122  0.4023048  0.5995321
##   0.0008215  0.7731122  0.4023048  0.5995321
##   0.0008216  0.7731122  0.4023048  0.5995321
##   0.0008217  0.7731122  0.4023048  0.5995321
##   0.0008218  0.7731122  0.4023048  0.5995321
##   0.0008219  0.7731122  0.4023048  0.5995321
##   0.0008220  0.7731122  0.4023048  0.5995321
##   0.0008221  0.7731122  0.4023048  0.5995321
##   0.0008222  0.7731122  0.4023048  0.5995321
##   0.0008223  0.7731122  0.4023048  0.5995321
##   0.0008224  0.7731122  0.4023048  0.5995321
##   0.0008225  0.7731122  0.4023048  0.5995321
##   0.0008226  0.7731122  0.4023048  0.5995321
##   0.0008227  0.7731122  0.4023048  0.5995321
##   0.0008228  0.7731122  0.4023048  0.5995321
##   0.0008229  0.7731122  0.4023048  0.5995321
##   0.0008230  0.7731122  0.4023048  0.5995321
##   0.0008231  0.7731122  0.4023048  0.5995321
##   0.0008232  0.7731122  0.4023048  0.5995321
##   0.0008233  0.7731122  0.4023048  0.5995321
##   0.0008234  0.7731122  0.4023048  0.5995321
##   0.0008235  0.7731122  0.4023048  0.5995321
##   0.0008236  0.7731122  0.4023048  0.5995321
##   0.0008237  0.7731122  0.4023048  0.5995322
##   0.0008238  0.7731122  0.4023048  0.5995322
##   0.0008239  0.7731122  0.4023048  0.5995322
##   0.0008240  0.7731122  0.4023048  0.5995322
##   0.0008241  0.7731122  0.4023048  0.5995322
##   0.0008242  0.7731122  0.4023048  0.5995322
##   0.0008243  0.7731122  0.4023048  0.5995322
##   0.0008244  0.7731122  0.4023048  0.5995322
##   0.0008245  0.7731122  0.4023048  0.5995322
##   0.0008246  0.7731122  0.4023048  0.5995322
##   0.0008247  0.7731122  0.4023048  0.5995322
##   0.0008248  0.7731122  0.4023048  0.5995322
##   0.0008249  0.7731122  0.4023048  0.5995322
##   0.0008250  0.7731122  0.4023048  0.5995322
##   0.0008251  0.7731122  0.4023048  0.5995322
##   0.0008252  0.7731122  0.4023048  0.5995322
##   0.0008253  0.7731122  0.4023048  0.5995322
##   0.0008254  0.7731122  0.4023048  0.5995322
##   0.0008255  0.7731122  0.4023048  0.5995322
##   0.0008256  0.7731122  0.4023048  0.5995322
##   0.0008257  0.7731122  0.4023048  0.5995322
##   0.0008258  0.7731122  0.4023048  0.5995322
##   0.0008259  0.7731122  0.4023048  0.5995322
##   0.0008260  0.7731122  0.4023048  0.5995322
##   0.0008261  0.7731122  0.4023048  0.5995322
##   0.0008262  0.7731122  0.4023048  0.5995322
##   0.0008263  0.7731122  0.4023048  0.5995322
##   0.0008264  0.7731122  0.4023048  0.5995322
##   0.0008265  0.7731122  0.4023048  0.5995322
##   0.0008266  0.7731122  0.4023048  0.5995322
##   0.0008267  0.7731122  0.4023048  0.5995322
##   0.0008268  0.7731123  0.4023048  0.5995323
##   0.0008269  0.7731123  0.4023048  0.5995323
##   0.0008270  0.7731123  0.4023048  0.5995323
##   0.0008271  0.7731123  0.4023048  0.5995323
##   0.0008272  0.7731123  0.4023048  0.5995323
##   0.0008273  0.7731123  0.4023048  0.5995323
##   0.0008274  0.7731123  0.4023048  0.5995323
##   0.0008275  0.7731123  0.4023048  0.5995323
##   0.0008276  0.7731123  0.4023048  0.5995323
##   0.0008277  0.7731123  0.4023048  0.5995323
##   0.0008278  0.7731123  0.4023048  0.5995323
##   0.0008279  0.7731123  0.4023048  0.5995323
##   0.0008280  0.7731123  0.4023048  0.5995323
##   0.0008281  0.7731123  0.4023048  0.5995323
##   0.0008282  0.7731123  0.4023048  0.5995323
##   0.0008283  0.7731123  0.4023048  0.5995323
##   0.0008284  0.7731123  0.4023048  0.5995323
##   0.0008285  0.7731123  0.4023048  0.5995323
##   0.0008286  0.7731123  0.4023048  0.5995323
##   0.0008287  0.7731123  0.4023048  0.5995323
##   0.0008288  0.7731123  0.4023048  0.5995323
##   0.0008289  0.7731123  0.4023048  0.5995323
##   0.0008290  0.7731123  0.4023048  0.5995323
##   0.0008291  0.7731123  0.4023048  0.5995323
##   0.0008292  0.7731123  0.4023048  0.5995323
##   0.0008293  0.7731123  0.4023048  0.5995323
##   0.0008294  0.7731123  0.4023048  0.5995323
##   0.0008295  0.7731123  0.4023048  0.5995323
##   0.0008296  0.7731123  0.4023048  0.5995323
##   0.0008297  0.7731123  0.4023048  0.5995323
##   0.0008298  0.7731123  0.4023048  0.5995323
##   0.0008299  0.7731123  0.4023048  0.5995324
##   0.0008300  0.7731123  0.4023048  0.5995324
## 
## Tuning parameter 'alpha' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 1 and lambda = 0.0007916.
predicted_test <- predict(or_mod_3, oregon_test)


RMSE_test <- RMSE(predicted_test, oregon_test$score)
cat("Test RMSEA is", RMSE_test, "\n")
## Test RMSEA is 2501.218
rsq_test <- cor(predicted_test, oregon_test$score)^2
cat("Test R-squared is", rsq_test, "\n")
## Test R-squared is 0.4011952
mae_test <- MAE(predicted_test, oregon_test$score)
cat("Test MAE is", mae_test, "\n")
## Test MAE is 2498.592
mod_3_stats <- c("Lasso Regression", RMSE_test, rsq_test, mae_test)

I tried the following parameters - 1) .001:3 by .01 (Used .001 for parameter) 2) .00002:.02 by 0001 (Used 0.00082) 3) 0.000005:.01 by .00005 (Used 0.000805) 4) 0.00079:0.00083 by 0.0000001 (Used 0.0007916)

Stopped testing due to negligible improvements in RMSEA/MAE. No improvement; more variability with test dataset.

Task 2.6

Evaluate the performance of the models in 2.2, 2.3, and 2.4 on the test dataset. Calculate and report the root mean squared error (RMSE), mean absolute error (MAE), and R-square. Summarize these numbers in a table like the following. Decide and comment on which model you would use to predict scores.

names <- c("Regression Model", "RMSE", "R-Squared", "MAE")
rbind(names, mod_1_stats, mod_2_stats, mod_3_stats)
##             [,1]                       [,2]               [,3]               
## names       "Regression Model"         "RMSE"             "R-Squared"        
## mod_1_stats "Unregularized Regression" "88.9664303339126" "0.401389773008871"
## mod_2_stats "Ridge Regression"         "2501.21841380854" "0.401296921657589"
## mod_3_stats "Lasso Regression"         "2501.21830079616" "0.401195181145662"
##             [,4]              
## names       "MAE"             
## mod_1_stats "69.1845396357428"
## mod_2_stats "2498.59224564397"
## mod_3_stats "2498.59229921243"

It seems unregularized model performs best, as it has least error.